regression modeling

安装量: 112
排名: #7642

安装

npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill 'Regression Modeling'
Regression Modeling
Overview
Regression modeling predicts continuous target values based on input features, establishing quantitative relationships between variables for forecasting and analysis.
When to Use
Predicting sales, prices, or other continuous numerical outcomes
Understanding relationships between independent and dependent variables
Forecasting trends based on historical data
Quantifying the impact of features on a target variable
Building baseline models for comparison with more complex algorithms
Identifying which variables most influence predictions
Regression Types
Linear Regression
Straight-line fit to data
Polynomial Regression
Non-linear relationships
Ridge (L2)
Regularization to prevent overfitting
Lasso (L1)
Feature selection through regularization
ElasticNet
Combines Ridge and Lasso
Robust Regression
Resistant to outliers
Key Metrics
R² Score
Proportion of variance explained
RMSE
Root Mean Squared Error
MAE
Mean Absolute Error
AIC/BIC
Model comparison criteria Implementation with Python import pandas as pd import numpy as np import matplotlib . pyplot as plt from sklearn . linear_model import ( LinearRegression , Ridge , Lasso , ElasticNet , HuberRegressor ) from sklearn . preprocessing import PolynomialFeatures , StandardScaler from sklearn . model_selection import train_test_split , cross_val_score from sklearn . metrics import mean_squared_error , r2_score , mean_absolute_error import seaborn as sns

Generate sample data

np . random . seed ( 42 ) X = np . random . uniform ( 0 , 100 , 200 ) . reshape ( - 1 , 1 ) y = 2.5 * X . squeeze ( ) + 30 + np . random . normal ( 0 , 50 , 200 ) X_train , X_test , y_train , y_test = train_test_split ( X , y , test_size = 0.2 )

Linear Regression

lr_model

LinearRegression ( ) lr_model . fit ( X_train , y_train ) y_pred_lr = lr_model . predict ( X_test ) print ( "Linear Regression:" ) print ( f" R² Score: { r2_score ( y_test , y_pred_lr ) : .4f } " ) print ( f" RMSE: { np . sqrt ( mean_squared_error ( y_test , y_pred_lr ) ) : .4f } " ) print ( f" Coefficient: { lr_model . coef_ [ 0 ] : .4f } " ) print ( f" Intercept: { lr_model . intercept_ : .4f } " )

Polynomial Regression (degree 2)

poly

PolynomialFeatures ( degree = 2 ) X_train_poly = poly . fit_transform ( X_train ) X_test_poly = poly . transform ( X_test ) poly_model = LinearRegression ( ) poly_model . fit ( X_train_poly , y_train ) y_pred_poly = poly_model . predict ( X_test_poly ) print ( "\nPolynomial Regression (degree=2):" ) print ( f" R² Score: { r2_score ( y_test , y_pred_poly ) : .4f } " ) print ( f" RMSE: { np . sqrt ( mean_squared_error ( y_test , y_pred_poly ) ) : .4f } " )

Ridge Regression (L2 regularization)

ridge_model

Ridge ( alpha = 1.0 ) ridge_model . fit ( X_train , y_train ) y_pred_ridge = ridge_model . predict ( X_test ) print ( "\nRidge Regression (alpha=1.0):" ) print ( f" R² Score: { r2_score ( y_test , y_pred_ridge ) : .4f } " ) print ( f" RMSE: { np . sqrt ( mean_squared_error ( y_test , y_pred_ridge ) ) : .4f } " )

Lasso Regression (L1 regularization)

lasso_model

Lasso ( alpha = 0.1 ) lasso_model . fit ( X_train , y_train ) y_pred_lasso = lasso_model . predict ( X_test ) print ( "\nLasso Regression (alpha=0.1):" ) print ( f" R² Score: { r2_score ( y_test , y_pred_lasso ) : .4f } " ) print ( f" RMSE: { np . sqrt ( mean_squared_error ( y_test , y_pred_lasso ) ) : .4f } " )

ElasticNet Regression

elastic_model

ElasticNet ( alpha = 0.1 , l1_ratio = 0.5 ) elastic_model . fit ( X_train , y_train ) y_pred_elastic = elastic_model . predict ( X_test ) print ( "\nElasticNet Regression:" ) print ( f" R² Score: { r2_score ( y_test , y_pred_elastic ) : .4f } " ) print ( f" RMSE: { np . sqrt ( mean_squared_error ( y_test , y_pred_elastic ) ) : .4f } " )

Robust Regression (resistant to outliers)

huber_model

HuberRegressor ( max_iter = 1000 , alpha = 0.1 ) huber_model . fit ( X_train , y_train ) y_pred_huber = huber_model . predict ( X_test ) print ( "\nHuber Regression (Robust):" ) print ( f" R² Score: { r2_score ( y_test , y_pred_huber ) : .4f } " ) print ( f" RMSE: { np . sqrt ( mean_squared_error ( y_test , y_pred_huber ) ) : .4f } " )

Visualization

fig , axes = plt . subplots ( 2 , 3 , figsize = ( 15 , 8 ) ) models_data = [ ( X_test , y_test , y_pred_lr , 'Linear' ) , ( X_test_poly , y_test , y_pred_poly , 'Polynomial (deg=2)' ) , ( X_test , y_test , y_pred_ridge , 'Ridge' ) , ( X_test , y_test , y_pred_lasso , 'Lasso' ) , ( X_test , y_test , y_pred_elastic , 'ElasticNet' ) , ( X_test , y_test , y_pred_huber , 'Huber' ) , ] for idx , ( X_p , y_t , y_p , label ) in enumerate ( models_data ) : if label in [ 'Polynomial (deg=2)' ] : x_plot = X_p [ : , 1 ]

Use quadratic feature for plotting

else : x_plot = X_p ax = axes [ idx // 3 , idx % 3 ] ax . scatter ( x_plot , y_t , alpha = 0.5 , label = 'Actual' ) ax . scatter ( x_plot , y_p , alpha = 0.5 , color = 'red' , label = 'Predicted' ) ax . set_title ( f' { label } \nR²= { r2_score ( y_t , y_p ) : .4f } ' ) ax . legend ( ) ax . grid ( True , alpha = 0.3 ) plt . tight_layout ( ) plt . show ( )

Residual analysis

fig , axes = plt . subplots ( 1 , 2 , figsize = ( 12 , 4 ) ) residuals = y_test - y_pred_lr axes [ 0 ] . scatter ( y_pred_lr , residuals , alpha = 0.5 ) axes [ 0 ] . axhline ( y = 0 , color = 'r' , linestyle = '--' ) axes [ 0 ] . set_title ( 'Residual Plot' ) axes [ 0 ] . set_xlabel ( 'Fitted Values' ) axes [ 0 ] . set_ylabel ( 'Residuals' ) axes [ 1 ] . hist ( residuals , bins = 20 , edgecolor = 'black' ) axes [ 1 ] . set_title ( 'Residuals Distribution' ) axes [ 1 ] . set_xlabel ( 'Residuals' ) axes [ 1 ] . set_ylabel ( 'Frequency' ) plt . tight_layout ( ) plt . show ( )

Cross-validation

cv_scores

cross_val_score ( LinearRegression ( ) , X , y , cv = 5 , scoring = 'r2' ) print ( f"\nCross-validation R² scores: { cv_scores } " ) print ( f"Mean CV R²: { cv_scores . mean ( ) : .4f } (+/- { cv_scores . std ( ) : .4f } )" )

Regularization parameter tuning

alphas

np . logspace ( - 3 , 3 , 100 ) ridge_scores = [ ] for alpha in alphas : ridge = Ridge ( alpha = alpha ) scores = cross_val_score ( ridge , X_train , y_train , cv = 5 , scoring = 'r2' ) ridge_scores . append ( scores . mean ( ) ) best_alpha_idx = np . argmax ( ridge_scores ) best_alpha = alphas [ best_alpha_idx ] plt . figure ( figsize = ( 10 , 5 ) ) plt . semilogx ( alphas , ridge_scores ) plt . axvline ( x = best_alpha , color = 'red' , linestyle = '--' , label = f'Best alpha= { best_alpha : .4f } ' ) plt . xlabel ( 'Alpha (Regularization Strength)' ) plt . ylabel ( 'Cross-validation R² Score' ) plt . title ( 'Ridge Regression: Alpha Tuning' ) plt . legend ( ) plt . grid ( True , alpha = 0.3 ) plt . show ( )

Feature importance (coefficients)

if hasattr ( lr_model , 'coef_' ) : print ( f"\nModel Coefficients: { lr_model . coef_ } " )

Additional evaluation and diagnostics

Model prediction intervals

from scipy import stats as sp_stats predictions = lr_model . predict ( X_test ) residuals = y_test - predictions mse = np . mean ( residuals ** 2 ) rmse = np . sqrt ( mse )

Prediction intervals (95%)

n

len ( X_test ) p = X_test . shape [ 1 ] dof = n - p - 1 t_val = sp_stats . t . ppf ( 0.975 , dof ) margin = t_val * np . sqrt ( mse * ( 1 + 1 / n ) ) pred_intervals = np . column_stack ( [ predictions - margin , predictions + margin ] ) print ( f"\nPrediction Intervals (95%):" ) print ( f"First prediction: { predictions [ 0 ] : .2f } [ { pred_intervals [ 0 , 0 ] : .2f } , { pred_intervals [ 0 , 1 ] : .2f } ]" )

Variance inflation factors for multicollinearity

from statsmodels . stats . outliers_influence import variance_inflation_factor vif_data = pd . DataFrame ( ) vif_data [ "Feature" ] = X_test . columns if hasattr ( X_test , 'columns' ) else range ( X_test . shape [ 1 ] ) try : vif_data [ "VIF" ] = [ variance_inflation_factor ( X . values , i ) for i in range ( X . shape [ 1 ] ) ] print ( "\nVariance Inflation Factor (VIF):" ) print ( vif_data ) except : print ( "VIF calculation skipped (insufficient features)" )

Prediction by group/segment

if
hasattr
(
X_test
,
'columns'
)
:
segment_results
=
{
}
for
feat
in
X_test
.
columns
[
:
2
]
:
q1
,
q3
=
X_test
[
feat
]
.
quantile
(
[
0.25
,
0.75
]
)
low
=
X_test
[
X_test
[
feat
]
<=
q1
]
high
=
X_test
[
X_test
[
feat
]
>=
q3
]
if
len
(
low
)
>
0
and
len
(
high
)
>
0
:
low_pred_rmse
=
np
.
sqrt
(
np
.
mean
(
(
y_test
[
low
.
index
]
-
lr_model
.
predict
(
low
)
)
**
2
)
)
high_pred_rmse
=
np
.
sqrt
(
np
.
mean
(
(
y_test
[
high
.
index
]
-
lr_model
.
predict
(
high
)
)
**
2
)
)
segment_results
[
feat
]
=
{
'Low RMSE'
:
low_pred_rmse
,
'High RMSE'
:
high_pred_rmse
,
}
if
segment_results
:
print
(
f"\nSegment Performance:"
)
for
feat
,
results
in
segment_results
.
items
(
)
:
print
(
f"
{
feat
}
Low=
{
results
[
'Low RMSE'
]
:
.2f
}
, High=
{
results
[
'High RMSE'
]
:
.2f
}
"
)
print
(
"\nRegression model evaluation complete!"
)
Assumption Checking
Linearity
Relationship is linear
Independence
Observations are independent
Homoscedasticity
Constant variance of errors
Normality
Errors are normally distributed
No multicollinearity
Features not highly correlated
Model Selection
Simple data
Linear regression
Non-linear patterns
Polynomial regression
Many features
Lasso or ElasticNet
Outliers
Robust regression
Prevent overfitting
Ridge or ElasticNet Deliverables Fitted models with coefficients R² and RMSE metrics Residual plots and analysis Cross-validation results Regularization parameter tuning curves Model comparison summary Predictions with confidence intervals
返回排行榜